Data Inspection¶

In [1]:
import plotly
plotly.offline.init_notebook_mode()
In [9]:
import pandas as pd
from tqdm import tqdm

df_list = list()
chunk_iter = pd.read_csv(
    "../data/Total_Data_10Y_Top24.csv", 
    chunksize=100000, 
    dtype = {"CANCELLATION_CODE": str}
)
for chunk in tqdm(chunk_iter):
    df_list.append(chunk)
df = pd.concat(df_list)
df.head()
130it [00:27,  4.66it/s]
Out[9]:
FL_DATE OP_UNIQUE_CARRIER TAIL_NUM ORIGIN_AIRPORT_SEQ_ID ORIGIN_CITY_MARKET_ID ORIGIN ORIGIN_CITY_NAME DEST_AIRPORT_SEQ_ID DEST_CITY_MARKET_ID DEST ... TAXI_IN ARR_TIME ARR_DELAY CANCELLED CANCELLATION_CODE CARRIER_DELAY WEATHER_DELAY NAS_DELAY SECURITY_DELAY LATE_AIRCRAFT_DELAY
0 2014-07-01 AA N002AA 1105703 31057 CLT Charlotte, NC 1129803 30194 DFW ... 28.0 1214.0 9.0 0.0 NaN NaN NaN NaN NaN NaN
1 2014-07-01 AA N002AA 1129803 30194 DFW Dallas/Fort Worth, TX 1105703 31057 CLT ... 13.0 945.0 0.0 0.0 NaN NaN NaN NaN NaN NaN
2 2014-07-01 AA N004AA 1039705 30397 ATL Atlanta, GA 1129803 30194 DFW ... 6.0 1341.0 -9.0 0.0 NaN NaN NaN NaN NaN NaN
3 2014-07-01 AA N004AA 1129803 30194 DFW Dallas/Fort Worth, TX 1039705 30397 ATL ... 7.0 1159.0 4.0 0.0 NaN NaN NaN NaN NaN NaN
4 2014-07-01 AA N004AA 1129803 30194 DFW Dallas/Fort Worth, TX 1039705 30397 ATL ... 6.0 2317.0 2.0 0.0 NaN NaN NaN NaN NaN NaN

5 rows × 24 columns

In [10]:
df.describe()
Out[10]:
ORIGIN_AIRPORT_SEQ_ID ORIGIN_CITY_MARKET_ID DEST_AIRPORT_SEQ_ID DEST_CITY_MARKET_ID DEP_TIME DEP_DELAY TAXI_OUT TAXI_IN ARR_TIME ARR_DELAY CANCELLED CARRIER_DELAY WEATHER_DELAY NAS_DELAY SECURITY_DELAY LATE_AIRCRAFT_DELAY
count 1.292656e+07 1.292656e+07 1.292656e+07 1.292656e+07 1.271494e+07 1.271486e+07 1.271048e+07 1.270744e+07 1.270744e+07 1.268139e+07 1.292656e+07 2.559603e+06 2.559603e+06 2.559603e+06 2.559603e+06 2.559603e+06
mean 1.298824e+06 3.161338e+04 1.298812e+06 3.161370e+04 1.332409e+03 1.127152e+01 1.722694e+01 8.678440e+00 1.471801e+03 5.409690e+00 1.675551e-02 2.041558e+01 2.650336e+00 1.520350e+01 1.268451e-01 2.413592e+01
std 1.453419e+05 1.168922e+03 1.453231e+05 1.168763e+03 5.192380e+02 4.514114e+01 9.141093e+00 6.704624e+00 5.554134e+02 4.738376e+01 1.283540e-01 5.935956e+01 2.208897e+01 3.191662e+01 3.210375e+00 5.163883e+01
min 1.039705e+06 3.019400e+04 1.039705e+06 3.019400e+04 1.000000e+00 -2.340000e+02 0.000000e+00 1.000000e+00 1.000000e+00 -2.380000e+02 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
25% 1.129806e+06 3.046600e+04 1.129806e+06 3.046600e+04 9.060000e+02 -5.000000e+00 1.200000e+01 5.000000e+00 1.052000e+03 -1.400000e+01 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
50% 1.289208e+06 3.145400e+04 1.289208e+06 3.145400e+04 1.324000e+03 -1.000000e+00 1.500000e+01 7.000000e+00 1.515000e+03 -5.000000e+00 0.000000e+00 3.000000e+00 0.000000e+00 2.000000e+00 0.000000e+00 0.000000e+00
75% 1.410702e+06 3.245700e+04 1.410702e+06 3.245700e+04 1.751000e+03 9.000000e+00 2.000000e+01 1.000000e+01 1.930000e+03 9.000000e+00 0.000000e+00 2.000000e+01 0.000000e+00 1.900000e+01 0.000000e+00 2.800000e+01
max 1.501606e+06 3.481900e+04 1.501606e+06 3.481900e+04 2.400000e+03 3.695000e+03 2.270000e+02 1.419000e+03 2.400000e+03 3.680000e+03 1.000000e+00 3.359000e+03 2.692000e+03 1.511000e+03 9.870000e+02 3.581000e+03
In [11]:
df.FL_DATE.max()
Out[11]:
'2024-06-30'
In [5]:
df.nunique()
Out[5]:
FL_DATE                  3653
OP_UNIQUE_CARRIER          20
TAIL_NUM                 9140
ORIGIN_AIRPORT_SEQ_ID      58
ORIGIN_CITY_MARKET_ID      20
ORIGIN                     24
ORIGIN_CITY_NAME           24
DEST_AIRPORT_SEQ_ID        58
DEST_CITY_MARKET_ID        20
DEST                       24
DEST_CITY_NAME             24
DEP_TIME                 1440
DEP_DELAY                1783
TAXI_OUT                  195
TAXI_IN                   247
ARR_TIME                 1440
ARR_DELAY                1806
CANCELLED                   2
CANCELLATION_CODE           4
CARRIER_DELAY            1558
WEATHER_DELAY             971
NAS_DELAY                 855
SECURITY_DELAY            268
LATE_AIRCRAFT_DELAY      1228
dtype: int64
In [6]:
df.dtypes
Out[6]:
FL_DATE                   object
OP_UNIQUE_CARRIER         object
TAIL_NUM                  object
ORIGIN_AIRPORT_SEQ_ID      int64
ORIGIN_CITY_MARKET_ID      int64
ORIGIN                    object
ORIGIN_CITY_NAME          object
DEST_AIRPORT_SEQ_ID        int64
DEST_CITY_MARKET_ID        int64
DEST                      object
DEST_CITY_NAME            object
DEP_TIME                 float64
DEP_DELAY                float64
TAXI_OUT                 float64
TAXI_IN                  float64
ARR_TIME                 float64
ARR_DELAY                float64
CANCELLED                float64
CANCELLATION_CODE         object
CARRIER_DELAY            float64
WEATHER_DELAY            float64
NAS_DELAY                float64
SECURITY_DELAY           float64
LATE_AIRCRAFT_DELAY      float64
dtype: object

Data Cleaning¶

In [7]:
MEDIUM_AIRPORT_CODE = [
    "DAL",
    "PDX",
    "STL",
    "RDU",
    "HOU",
    "SMF",
    "MSY",
    "SJC",
    "SJU",
    "SNA"
]
In [12]:
import matplotlib.pyplot as plt
target_df = df.drop(columns = [
    "OP_UNIQUE_CARRIER",
    "ORIGIN_AIRPORT_SEQ_ID",
    "ORIGIN_CITY_MARKET_ID",
    "ORIGIN",
    "ORIGIN_CITY_NAME",
    "ORIGIN_AIRPORT_SEQ_ID",
    "DEST_AIRPORT_SEQ_ID",
    "DEST_CITY_MARKET_ID",
    "DEST",
    "DEST_CITY_NAME",
    "DEST_AIRPORT_SEQ_ID",
    "DEP_TIME",
    "TAXI_OUT",
    "TAXI_IN",
    "ARR_TIME",
])
del df
target_df['FL_DATE'] = pd.to_datetime(target_df['FL_DATE'])
target_df.isnull().mean()
Out[12]:
FL_DATE                0.000000
TAIL_NUM               0.004830
DEP_DELAY              0.016377
ARR_DELAY              0.018966
CANCELLED              0.000000
CANCELLATION_CODE      0.983244
CARRIER_DELAY          0.801989
WEATHER_DELAY          0.801989
NAS_DELAY              0.801989
SECURITY_DELAY         0.801989
LATE_AIRCRAFT_DELAY    0.801989
dtype: float64
In [17]:
plane_set_df = target_df.TAIL_NUM.drop_duplicates()
plane_review = pd.read_csv("../supplementary/N-Number-Registration-Data-2024.csv")
plane_review['N-NUMBER'] = plane_review['N-NUMBER'].apply(lambda x:"N"+x)
plane_review = plane_review[plane_review['N-NUMBER'].isin(plane_set_df)]
plane_review
Out[17]:
N-NUMBER COUNTRY TYPE AIRCRAFT TYPE ENGINE MFR MODEL NO-ENG NO-SEATS AC-WEIGHT MFR_ENGINE MODEL_ENGINE HORSEPOWER THRUST
651 N10156 US Fixed wing multi engine Turbo-fan EMBRAER EMB-145XR 2 55 12,500 - 19,999 ROLLS-ROYC AE3007 SER 0.0 6442.0
813 N101DQ US Fixed wing multi engine Turbo-fan AIRBUS A321-211 2 199 20,000 and over CFM INTL CFM56-5B3/3 0.0 32000.0
816 N101DU US Fixed wing multi engine Turbo-fan C SERIES AIRCRAFT LTD PTNRSP BD-500-1A10 2 133 20,000 and over P & W PW1519G 0.0 19775.0
874 N101HQ US Fixed wing multi engine Turbo-fan EMBRAER-EMPRESA BRASILEIRA DE ERJ 170-200 LR 2 80 20,000 and over GE CF34-8E5 0.0 14510.0
946 N101NN US Fixed wing multi engine Turbo-fan AIRBUS INDUSTRIE A321-231 2 379 20,000 and over IAE V2500SERIES 0.0 25000.0
... ... ... ... ... ... ... ... ... ... ... ... ... ...
297051 N998AN US Fixed wing multi engine Turbo-fan AIRBUS A321-231 2 379 20,000 and over IAE V2533-A5 0.0 31600.0
297052 N998AT US Fixed wing multi engine Turbo-fan BOEING 717-200 2 100 20,000 and over ROLLS-ROYC TAY 651-54 0.0 15400.0
297102 N998JE US Fixed wing multi engine Turbo-fan AIRBUS A321-231 2 379 20,000 and over IAE V2533-A5 0.0 31600.0
297126 N998NN US Fixed wing multi engine Turbo-fan BOEING 737-800 2 175 20,000 and over CFM INTL CFM56-7B24E 0.0 24200.0
297463 N999JQ US Fixed wing multi engine Turbo-fan AIRBUS A321-231 2 379 20,000 and over IAE V2533-A5 0.0 31600.0

6798 rows × 13 columns

In [16]:
airport_review_count = airport_review.groupby("AIRPORT_CODE").size().reset_index().rename(columns = {0:"reviewCount"})
airport_review_merged = pd.merge(airport_review_count, airport_default_score, how = "inner", on = "AIRPORT_CODE")
airport_review_merged['Large'] = ~airport_review_merged.AIRPORT_CODE.isin(MEDIUM_AIRPORT_CODE)
airport_review_merged.head()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[16], line 1
----> 1 airport_review_count = airport_review.groupby("AIRPORT_CODE").size().reset_index().rename(columns = {0:"reviewCount"})
      2 airport_review_merged = pd.merge(airport_review_count, airport_default_score, how = "inner", on = "AIRPORT_CODE")
      3 airport_review_merged['Large'] = ~airport_review_merged.AIRPORT_CODE.isin(MEDIUM_AIRPORT_CODE)

NameError: name 'airport_review' is not defined
In [18]:
target_airport_df = pd.merge(
    target_df, 
    plane_review,
    how = "inner", 
    left_on = "TAIL_NUM", 
    right_on = "N-NUMBER",
    suffixes = ["", "_origin"]
)
"""
target_airport_df.rename(columns = {
    'reviewCount':'reviewCount_origin', 
    'ratingValue':'ratingValue_origin', 
    'queueTime':'queueTime_origin',
    'terminalCleanliness':'terminalCleanliness_origin', 
    'terminalSeating':'terminalSeating_origin', 
    'terminalSign':'terminalSign_origin',
    'foodBeverage':'foodBeverage_origin', 
    'airportShopping':'airportShopping_origin', 
    'wifiConnectivity':'wifiConnectivity_origin', 
    'airportStaff':'airportStaff_origin',
    'sentiment':'sentiment_origin', 
    'Large':'Large_origin'
}, inplace = True)
"""
target_airport_df.drop(columns = ['N-NUMBER'], inplace = True)
target_airport_df['CANCELLATION_CODE'] = target_airport_df['CANCELLATION_CODE'].fillna("Not")
target_airport_df.head()
Out[18]:
FL_DATE TAIL_NUM DEP_DELAY ARR_DELAY CANCELLED CANCELLATION_CODE CARRIER_DELAY WEATHER_DELAY NAS_DELAY SECURITY_DELAY ... TYPE ENGINE MFR MODEL NO-ENG NO-SEATS AC-WEIGHT MFR_ENGINE MODEL_ENGINE HORSEPOWER THRUST
0 2014-07-01 N200AA 3.0 -13.0 0.0 Not NaN NaN NaN NaN ... Turbo-prop TEXTRON AVIATION INC 208B 1 12 Up to 12,499 P&W CANADA PT6A-140 867.0 0.0
1 2014-07-01 N200AA 5.0 1.0 0.0 Not NaN NaN NaN NaN ... Turbo-prop TEXTRON AVIATION INC 208B 1 12 Up to 12,499 P&W CANADA PT6A-140 867.0 0.0
2 2014-07-01 N201AA -5.0 -3.0 0.0 Not NaN NaN NaN NaN ... Reciprocating CESSNA 150 1 2 Up to 12,499 CONT MOTOR 0-200 SERIES 100.0 0.0
3 2014-07-01 N201AA -5.0 0.0 0.0 Not NaN NaN NaN NaN ... Reciprocating CESSNA 150 1 2 Up to 12,499 CONT MOTOR 0-200 SERIES 100.0 0.0
4 2014-07-01 N201AA -1.0 -2.0 0.0 Not NaN NaN NaN NaN ... Reciprocating CESSNA 150 1 2 Up to 12,499 CONT MOTOR 0-200 SERIES 100.0 0.0

5 rows × 23 columns

Visualization¶

Delay Trend¶

In [17]:
import plotly.express as px

delay_by_year = target_airport_df[[
    'FL_DATE',
    "DEP_DELAY",
    'CARRIER_DELAY',
    'WEATHER_DELAY', 
    'NAS_DELAY', 
    'SECURITY_DELAY',
    'LATE_AIRCRAFT_DELAY',
]]
delay_by_year['Year'] = delay_by_year['FL_DATE'].dt.year
delay_by_year = delay_by_year.groupby("Year")[[
    "DEP_DELAY",
    'CARRIER_DELAY',
    'WEATHER_DELAY', 
    'NAS_DELAY', 
    'SECURITY_DELAY',
    'LATE_AIRCRAFT_DELAY',
]].mean().reset_index()
delay_by_year = delay_by_year.melt(id_vars = "Year", value_name = "Delay (m)")

fig = px.area(
    delay_by_year, 
    x="Year", 
    y="Delay (m)",
    color ='variable',
    title='Airport Delay Trend in the Past 10Y',
)
fig.update_layout(
    height=600,
)
fig.show()
C:\Users\wongh\AppData\Local\Temp\ipykernel_5332\142654084.py:12: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

Airport Review¶

First of all, let's have a look at the Review Data we collected

In [59]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

corr_matrix = airport_review_merged.loc[:, 'reviewCount':].corr()
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', mask = mask)
plt.title('Airport Review Score Correlation')
plt.show()
No description has been provided for this image

Summary

The majority of the score given by users are very correlated,
with the exception of reviewCount of an airport, which inversely correlate with all other attributes.

Airport Ranking¶

In [60]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots 
In [61]:
fig = go.Figure()

fig = px.scatter(
    data_frame = airport_review_merged,
    x='sentiment', 
    y='ratingValue',
    color='ratingValue',
    size = 'reviewCount',
    color_continuous_scale = 'viridis',
    facet_col = "Large",
    text='AIRPORT_CODE',
    log_y = True
)
fig.update_traces(textposition='top center')
fig.for_each_annotation(
    lambda a: a.update(text= "Medium Airport" if a.text == "Large=False" else "Large Airport")
)
fig.update_layout(
    height=600, 
    width=1000, 
    title_text="Airport Sentiment Visualization", 
    showlegend=False
)
fig.show()

Review Performance per Airport¶

In [62]:
df_columns = [
    'queueTime',
    'terminalCleanliness',
    'terminalSeating',
    'terminalSign',
    'foodBeverage',
    'airportShopping',
    'wifiConnectivity',
    'airportStaff',
]
fig = make_subplots(rows=4, cols=2, subplot_titles=df_columns)
col = [1, 2]*4
row = [1, 1, 2, 2, 3, 3, 4, 4]
for c, r, column in zip(col, row, df_columns):
    airport_review_merged.sort_values(column, ascending = False, inplace = True)
    trace = go.Bar(
        x=airport_review_merged['AIRPORT_CODE'], 
        y=airport_review_merged[column],
        marker=dict(
            color = airport_review_merged[column],
            colorscale='viridis'
        )
    )
    fig.add_trace(
        trace, 
        row=r, 
        col=c
    )
fig.update_layout(
    height=1000, 
    title_text="Individual Scoring Attributes of Airports",
    showlegend=False
)
fig.show()

Delay vs Ranking¶

In [21]:
target_airport_df.dtypes
Out[21]:
FL_DATE                datetime64[ns]
TAIL_NUM                       object
DEP_DELAY                     float64
ARR_DELAY                     float64
CANCELLED                     float64
CANCELLATION_CODE              object
CARRIER_DELAY                 float64
WEATHER_DELAY                 float64
NAS_DELAY                     float64
SECURITY_DELAY                float64
LATE_AIRCRAFT_DELAY           float64
COUNTRY                        object
TYPE AIRCRAFT                  object
TYPE ENGINE                    object
MFR                            object
MODEL                          object
NO-ENG                          int64
NO-SEATS                        int64
AC-WEIGHT                      object
MFR_ENGINE                     object
MODEL_ENGINE                   object
HORSEPOWER                    float64
THRUST                        float64
dtype: object
In [25]:
target_airport_df.nunique()
Out[25]:
FL_DATE                3653
TAIL_NUM               6798
DEP_DELAY              1757
ARR_DELAY              1778
CANCELLED                 2
CANCELLATION_CODE         5
CARRIER_DELAY          1525
WEATHER_DELAY           923
NAS_DELAY               820
SECURITY_DELAY          255
LATE_AIRCRAFT_DELAY    1212
COUNTRY                   4
TYPE AIRCRAFT             5
TYPE ENGINE               8
MFR                     101
MODEL                   293
NO-ENG                    7
NO-SEATS                 73
AC-WEIGHT                 4
MFR_ENGINE               33
MODEL_ENGINE            230
HORSEPOWER               55
THRUST                   99
dtype: int64
In [22]:
train_columns = [
    'NO-ENG',
    'NO-SEATS',
    'HORSEPOWER',
    'THRUST'
]
label_columns = [
    "DEP_DELAY",
    "ARR_DELAY",
    "CARRIER_DELAY",
    "WEATHER_DELAY",
    "NAS_DELAY",
    "SECURITY_DELAY",
    "LATE_AIRCRAFT_DELAY",
]
In [30]:
corr_matrix.head()
Out[30]:
AC-WEIGHT_12,500 - 19,999 AC-WEIGHT_20,000 and over AC-WEIGHT_UAV up to 55 AC-WEIGHT_Up to 12,499 COUNTRY_ COUNTRY_CA COUNTRY_GB COUNTRY_US HORSEPOWER NO-ENG ... TYPE AIRCRAFT_Rotorcraft TYPE AIRCRAFT_Weight-shift-control TYPE ENGINE_2 Cycle TYPE ENGINE_4 Cycle TYPE ENGINE_Electric TYPE ENGINE_Reciprocating TYPE ENGINE_Turbo-fan TYPE ENGINE_Turbo-jet TYPE ENGINE_Turbo-prop TYPE ENGINE_Turbo-shaft
NO-ENG 0.002914 0.246786 0.187011 -0.335489 0.046053 0.000558 -0.052084 -0.012037 0.006483 1.000000 ... 0.377504 -0.070834 -0.058386 -0.188460 0.747054 -0.444784 0.197079 0.005092 -0.140875 -0.089214
NO-SEATS -0.165297 0.304866 -0.033574 -0.252879 -0.011352 -0.007109 -0.021893 0.020456 -0.147924 0.079682 ... -0.098852 -0.027984 -0.023100 -0.078553 -0.064292 -0.202713 0.176289 -0.010630 -0.104905 -0.064378
HORSEPOWER -0.038858 0.032154 NaN -0.010212 0.036069 -0.007996 -0.000449 -0.020415 1.000000 0.006483 ... -0.001564 -0.004849 -0.003961 -0.009872 NaN -0.020284 0.031009 -0.034264 0.027150 0.003418
THRUST -0.054376 0.173511 NaN -0.176758 -0.027787 0.001945 -0.002652 0.018845 -0.751091 0.155881 ... -0.046561 -0.019958 -0.015674 -0.052285 NaN -0.145564 0.101029 0.014589 -0.080838 -0.037308
DEP_DELAY 0.009573 -0.005004 -0.000896 -0.000781 -0.000230 0.000295 0.001135 -0.000454 0.000385 0.000038 ... 0.000323 -0.000802 -0.000157 -0.001803 -0.000674 -0.000369 0.003089 -0.002669 -0.001903 0.001144

5 rows × 25 columns

In [32]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

cat_columns = ['COUNTRY', 'TYPE AIRCRAFT', 'TYPE ENGINE', 'AC-WEIGHT']
num_columns = ['NO-ENG','NO-SEATS','HORSEPOWER','THRUST'] + label_columns

# Create dummy variables for categorical columns
dummy_df = pd.get_dummies(target_airport_df[cat_columns])
# Combine dummy variables with numerical columns
new_df = pd.concat([target_airport_df[num_columns], dummy_df], axis=1)
# Assuming the DataFrame is called 'df'
corr_matrix = new_df.corr()
corr_matrix = corr_matrix.loc[corr_matrix.columns.difference(label_columns), label_columns]

# Create a heatmap using Seaborn
sns.set(style="white")
plt.figure(figsize=(16,10))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")
plt.title('Correlation Matrix')
plt.show()
No description has been provided for this image

Delay Data Size >0

In [64]:
cal_df = target_airport_df[['ORIGIN'] + label_columns]
cal_df.groupby('ORIGIN')[label_columns].apply(lambda x: (x > 0).mean()).reset_index()
Out[64]:
ORIGIN DEP_DELAY ARR_DELAY CARRIER_DELAY WEATHER_DELAY NAS_DELAY SECURITY_DELAY LATE_AIRCRAFT_DELAY
0 ATL 0.391977 0.346680 0.102059 0.012486 0.098133 0.000559 0.070856
1 CLT 0.386671 0.377364 0.117649 0.012658 0.119450 0.002985 0.094161
2 DAL 0.511887 0.405451 0.138866 0.010460 0.092957 0.001003 0.135679
3 DEN 0.442719 0.386610 0.127630 0.010701 0.112139 0.000455 0.110875
4 DFW 0.417090 0.400279 0.137167 0.018804 0.118154 0.000864 0.105878
5 EWR 0.401861 0.362531 0.133538 0.011004 0.122859 0.000979 0.107049
6 HOU 0.461493 0.377129 0.120927 0.009179 0.084441 0.000970 0.115730
7 JFK 0.357083 0.337509 0.119247 0.010789 0.129858 0.001389 0.068300
8 LAS 0.400518 0.375535 0.111114 0.005143 0.108687 0.000699 0.113227
9 LAX 0.384964 0.363739 0.104485 0.004377 0.102161 0.000770 0.096527
10 MCO 0.390880 0.372180 0.110292 0.020802 0.120200 0.001602 0.106974
11 MIA 0.405724 0.393572 0.136413 0.019572 0.126643 0.001830 0.099147
12 MSY 0.360981 0.338699 0.087303 0.010217 0.093822 0.000673 0.106858
13 ORD 0.405576 0.382400 0.121694 0.021180 0.129221 0.000579 0.098953
14 PDX 0.298812 0.321973 0.066115 0.004676 0.084625 0.000397 0.068284
15 PHX 0.392451 0.362283 0.101458 0.003507 0.090475 0.001300 0.098925
16 RDU 0.344065 0.332790 0.086812 0.008671 0.104936 0.000614 0.091356
17 SEA 0.351422 0.362239 0.082688 0.006774 0.105163 0.000757 0.068566
18 SFO 0.369786 0.367693 0.098870 0.004128 0.108372 0.000497 0.107824
19 SJC 0.340300 0.339721 0.079905 0.003123 0.076541 0.000883 0.098579
20 SJU 0.347668 0.355347 0.117447 0.002182 0.102229 0.001834 0.121220
21 SMF 0.358445 0.343033 0.077687 0.003122 0.080243 0.000597 0.100190
22 SNA 0.352813 0.332351 0.068681 0.004160 0.083964 0.000328 0.098166
23 STL 0.390224 0.347997 0.098000 0.008861 0.093257 0.000460 0.101216
In [23]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

corr_matrix = target_airport_df.loc[:, train_columns + label_columns].corr()

plt.figure(figsize=(10, 5))
sns.heatmap(corr_matrix.loc[train_columns, label_columns], annot=True, cmap='coolwarm')
plt.title('Airport Review Score Correlation with Delays')
plt.show()

plt.figure(figsize=(10, 5))
label_columns.remove('NAS_DELAY')
sns.heatmap(corr_matrix.loc[train_columns, label_columns], annot=True, cmap='coolwarm')
plt.title('Airport Review Score Correlation with Delays (Without NAS)')
plt.show()
No description has been provided for this image
No description has been provided for this image
In [14]:
import plotly.express as px

fig = px.box(
    target_airport_df.sample(1000000, replace = False), 
    x="DEST", 
    y="NAS_DELAY",
    log_y = True,
    color = f"Large_dest",
    color_discrete_map = {True: 'red', False: 'blue'}
)
fig.update_layout(
    height=400, 
    title_text=f"Destination Aiport vs NAS Delay",
    showlegend=True
)
fig.show()

Departure Delay Per Airport¶

In [21]:
for airport_direction in ["ORIGIN", "DEST"]:
    airport_temp = target_airport_df.sample(1000000, replace = False).groupby(airport_direction)['DEP_DELAY'].quantile(0.75).reset_index()
    airport_temp_merged = pd.merge(
        target_airport_df.sample(100000, replace = False), 
        airport_temp, how = "inner", 
        on = airport_direction, 
        suffixes = ["", "_mean"]
    )
    airport_temp_merged['DEP_DELAY'] = airport_temp_merged['DEP_DELAY'].apply(lambda x: 0 if x < 0 else x)
    fig = px.box(
        airport_temp_merged.sort_values([
            f"Large_{airport_direction.lower()}", 
            "DEP_DELAY_mean"
        ], ascending = False), 
        x=airport_direction, 
        y="DEP_DELAY",
        log_y = True,
        color = f"Large_{airport_direction.lower()}",
        color_discrete_map = {True: 'red', False: 'blue'}
    )
    fig.update_layout(
        height=400, 
        title_text=f"{airport_direction} Aiport Departure Delay",
        showlegend=True
    )
    fig.show()

Cancelled¶

In [67]:
train_columns = [
    'queueTime_origin',
    'terminalCleanliness_origin',
    'terminalSeating_origin',
    'terminalSign_origin',
    'foodBeverage_origin',
    'airportShopping_origin',
    'wifiConnectivity_origin',
    'airportStaff_origin',
    'Large_origin',
    'queueTime_dest',
    'terminalCleanliness_dest',
    'terminalSeating_dest',
    'terminalSign_dest',
    'foodBeverage_dest',
    'airportShopping_dest',
    'wifiConnectivity_dest',
    'airportStaff_dest',
    'Large_dest'
]
label_columns = [
    "Cancellation_A",
    "Cancellation_B",
    "Cancellation_C",
    "Cancellation_D",
    "Cancellation_Not"
]
In [69]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

temp = pd.concat([
    target_airport_df, 
    pd.get_dummies(target_airport_df['CANCELLATION_CODE'], prefix = "Cancellation")
], axis = 1)

corr_matrix = temp.loc[:, train_columns + label_columns].corr()

plt.figure(figsize=(10, 5))
sns.heatmap(corr_matrix.loc[train_columns, label_columns], annot=True, cmap='coolwarm')
plt.title('Airport Review Score Correlation with Delays')
plt.show()
No description has been provided for this image
In [70]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from tqdm import tqdm
import numpy as np
import math
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import label_binarize

def preprocess(df, y_columns='ARR_DELAY'):
    X = df.loc[:, train_columns]
    y = df.loc[:, y_columns]
    return X, y
In [ ]:
train, test = train_test_split(target_airport_df, test_size=0.2)
In [72]:
train_X, train_y = preprocess(train, "CANCELLATION_CODE")
test_X, test_y = preprocess(test, "CANCELLATION_CODE")

min_max_scaler = preprocessing.MinMaxScaler()
train_X_scaled = min_max_scaler.fit_transform(train_X)
test_X_scaled = min_max_scaler.transform(test_X)
In [73]:
model = LogisticRegression(
    multi_class='ovr', 
    max_iter=1000,
    class_weight="balanced"
)
model.fit(train_X_scaled, train_y)
C:\Users\wongh\anaconda3\envs\601\lib\site-packages\sklearn\linear_model\_logistic.py:1256: FutureWarning:

'multi_class' was deprecated in version 1.5 and will be removed in 1.7. Use OneVsRestClassifier(LogisticRegression(..)) instead. Leave it to its default value to avoid this warning.

Out[73]:
LogisticRegression(class_weight='balanced', max_iter=1000, multi_class='ovr')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(class_weight='balanced', max_iter=1000, multi_class='ovr')
In [74]:
import seaborn as sn
import matplotlib.pyplot as plt

train_predictions = model.predict(train_X_scaled)
test_predictions = model.predict(test_X_scaled)

train_conf_matrix = confusion_matrix(train_y, train_predictions)
test_conf_matrix = confusion_matrix(test_y, test_predictions)

sn.heatmap(train_conf_matrix, annot=True, fmt='g')
plt.show()

sn.heatmap(test_conf_matrix, annot=True, fmt='g')
plt.show()
No description has been provided for this image
No description has been provided for this image
In [76]:
train_y_encoded = label_binarize(train_y, classes=model.classes_)
test_y_encoded = label_binarize(test_y, classes=model.classes_)

train_probs = model.predict_proba(train_X_scaled)
test_probs = model.predict_proba(test_X_scaled)

fpr_train, tpr_train, _ = roc_curve(train_y_encoded.ravel(), train_probs.ravel())
fpr_test, tpr_test, _ = roc_curve(test_y_encoded.ravel(), test_probs.ravel())

train_auc = roc_auc_score(train_y_encoded, train_probs, average='micro')
test_auc = roc_auc_score(test_y_encoded, test_probs, average='micro')
In [77]:
def roc(fpr, tpr, auc):
    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    plt.plot(
        fpr, 
        tpr, 
        label=f'Train ROC curve (area = {auc:.2})'
    )
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Train ROC Curve')
    plt.legend(loc="lower right")

roc(fpr_train, tpr_train, train_auc)
roc(fpr_test, tpr_test, test_auc)
No description has been provided for this image
No description has been provided for this image
In [78]:
weight = model.coef_

fig, axs = plt.subplots(5, 1, figsize=(10, 20))

for i in range(5):
    ax = axs[i]
    ax.barh(train_columns, weight[i])
    ax.set_title(f'Bar Chart {label_columns[i]}')

plt.tight_layout()
plt.show()
No description has been provided for this image

from sklearn.linear_model import LinearRegression from sklearn.model_selection import train_test_split from sklearn import preprocessing from tqdm import tqdm import numpy as np import math

def preprocess(df, y_columns = 'ARR_DELAY'): df.dropna(subset = y_columns, inplace = True) X = df.loc[:, train_columns] y = df.loc[:, y_columns] return X, y

def experiment(df, y_columns = 'ARR_DELAY'): train, test = train_test_split(df, test_size=0.2)

train_X, train_y = preprocess(train, y_columns)
test_X, test_y = preprocess(test, y_columns)

min_max_scaler = preprocessing.MinMaxScaler()
train_X_scaled = min_max_scaler.fit_transform(train_X)
test_X_scaled = min_max_scaler.transform(test_X)

model = LinearRegression()
model.fit(train_X_scaled, train_y)

train_r_score = model.score(train_X_scaled, train_y)
test_r_score = model.score(test_X_scaled, test_y)

return train_r_score, test_r_score, model.coef_

Redacted¶

from sklearn.linear_model import LinearRegression from sklearn.model_selection import train_test_split from sklearn import preprocessing from tqdm import tqdm import numpy as np import math

def preprocess(df, y_columns = 'ARR_DELAY'): df.dropna(subset = y_columns, inplace = True) X = df.loc[:, train_columns] y = df.loc[:, y_columns] return X, y

def experiment(df, y_columns = 'ARR_DELAY'): train, test = train_test_split(df, test_size=0.2)

train_X, train_y = preprocess(train, y_columns)
test_X, test_y = preprocess(test, y_columns)

min_max_scaler = preprocessing.MinMaxScaler()
train_X_scaled = min_max_scaler.fit_transform(train_X)
test_X_scaled = min_max_scaler.transform(test_X)

model = LinearRegression()
model.fit(train_X_scaled, train_y)

train_r_score = model.score(train_X_scaled, train_y)
test_r_score = model.score(test_X_scaled, test_y)

return train_r_score, test_r_score, model.coef_

Arrival Delay Reasoning¶

train_r_list, test_r_list, coef_list = list(), list(), list() for i in tqdm(range(EXPERIMENT_COUNT)): np.random.seed(i) train_r_score, test_r_score, coef = experiment(target_airport_df, 'LOG_ARR_DELAY') train_r_list.append(train_r_score) test_r_list.append(test_r_score) coef_list.append(coef)

import plotly.express as px

coef_df_list = list()

for i, array in enumerate(coef_list): coef_df = pd.DataFrame({ "Feature": train_columns, "Weight": array }) coef_df_list.append(coef_df)

result_df = pd.concat(coef_df_list)

fig = px.box( result_df, x="Feature", y="Weight", ) fig.update_layout( height=600, title_text="Aiport Rating vs Arrival Delay", showlegend=False ) fig.show()

In [19]:
train_r_list, test_r_list
Out[19]:
([0.0027769056925236457,
  0.002726794565260926,
  0.002739984286952546,
  0.0027382477593851373,
  0.002750673837592532,
  0.0027616015050112575,
  0.002722193250161431,
  0.002763664872232119,
  0.0027681168845542503,
  0.002773670861719535],
 [0.0026226756634710746,
  0.0028214286965265734,
  0.0027696415080686876,
  0.002776677393967586,
  0.0027283399081202697,
  0.0026838710483376715,
  0.0028416825308008598,
  0.0026738151940109978,
  0.00265653353537032,
  0.002636301651983919])

Departure Delay Reasoning¶

train_r_list, test_r_list, coef_list = list(), list(), list() for i in tqdm(range(EXPERIMENT_COUNT)): np.random.seed(i) train_r_score, test_r_score, coef = experiment(target_airport_df, 'DEP_DELAY') train_r_list.append(train_r_score) test_r_list.append(test_r_score) coef_list.append(coef)

import plotly.express as px

coef_df_list = list()

for i, array in enumerate(coef_list): coef_df = pd.DataFrame({ "Feature": train_columns, "Weight": array }) coef_df_list.append(coef_df)

result_df = pd.concat(coef_df_list)

fig = px.box( result_df, x="Feature", y="Weight", ) fig.update_layout( height=600, title_text="Aiport Rating vs Departure Delay", showlegend=False ) fig.show()

In [ ]:
 

import pandas as pd import plotly.graph_objects as go import plotly.express as px

fig = go.Figure()

coef_df.sort_values("Weight", ascending = False, inplace = True) fig = px.bar( data_frame = coef_df, x='Feature', y='Weight', color='Weight', color_continuous_scale = 'viridis', ) fig.update_layout( height=600, width=1000, title_text="Airport Sentiment Visualization", showlegend=False ) fig.show()

import math

target_airport_df["LOG_ARR_DELAY"] = target_airport_df["ARR_DELAY"].apply(lambda x: 0 if x <= 0 else math.log(x)) target_airport_df["LOG_DEP_DELAY"] = target_airport_df["DEP_DELAY"].apply(lambda x: 0 if x <= 0 else math.log(x)) target_airport_df["LOG_ARR_DELAY"]

In [ ]: